import os
import pandas as pd
import numpy as np
import random
import holidays
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from tqdm import tqdm
from scipy.signal import find_peaks, detrend
import plotly.express as px
from IPython.display import display
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, root_mean_squared_error, make_scorer
from sklearn.metrics import root_mean_squared_error , mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.linear_model import LinearRegression
def describe_time_series(df, col=None, hist_margin='rug', hist_on=False):
    """Profile a time series stored in a DataFrame with a DatetimeIndex.

    Prints timestamp-integrity diagnostics (duplicate or missing timestamps),
    missing-value counts, and yearly summary statistics; shows a line plot and,
    optionally, a distribution plot.

    Parameters
    ----------
    df : DataFrame with a DatetimeIndex.
    col : column to profile; defaults to the first column.
    hist_margin : marginal plot type for the histogram ('rug', 'box', 'violin').
    hist_on : when True, also display the distribution plot.
    """
    col_ts = col if col is not None else df.columns[0]
    ts = df.loc[:, [col_ts]]

    ## CHECK TIMESTAMPS: rebuild the full expected index and count rows per stamp.
    # NOTE(review): if the index has no freq set, pd.date_range falls back to
    # daily -- confirm ts.index.freq is populated by the caller.
    continuous_index = pd.date_range(ts.index.min(), ts.index.max(), freq=ts.index.freq)
    continuous_dates_df = pd.DataFrame(index=continuous_index,
                                       data={'cont_dates': continuous_index})
    measures_per_timestamp = (continuous_dates_df
                              .merge(ts, left_index=True, right_index=True, how='left')
                              .groupby('cont_dates')
                              .count())
    # A healthy series has exactly one measurement per timestamp; flag everything
    # else (0 = missing, 2+ = duplicated). The previous version stored the raw
    # count and then filtered on `problem == 1`, which never matched, so gaps
    # and duplicates were always reported as an empty frame.
    measures_per_timestamp['problem'] = (measures_per_timestamp[col_ts] != 1).astype(int)
    print(' DUPLICATE OR MISSING TIMESTAMPS',
          measures_per_timestamp[measures_per_timestamp.problem == 1])

    ## CHECK NULL VALUES -- .iloc for positional access (plain [0] is deprecated),
    ## and multiply by 100 so the printed value really is a percentage.
    n_missing = ts.isnull().sum().iloc[0]
    print('MISSING VALUES in ', n_missing, ',', 100 * n_missing / ts.shape[0], ' % TIMESTEPS')
    print('NUMBER OF TIMESTEPS', ts.shape[0])

    ## PLOT TIMESERIES
    line_plot = px.line(data_frame=ts,
                        x=ts.index,
                        y=str(col_ts),
                        title='LINE PLOT ' + str(col_ts))
    ## DISTRIBUTION PLOT
    histo = px.histogram(ts, x=str(col_ts), marginal=hist_margin)  # can be `box`, `violin`
    ## Yearly aggregates, anchored at year start.
    yearly_summary = ts.resample('YS').agg(['sum', 'mean', 'min', 'max', 'std'])
    line_plot.show()
    if hist_on:
        histo.show()
    print(yearly_summary)
    return
def extract_time_features(df_in, holidays_on=True, feature_format='linear'):
    """Add calendar features derived from the DataFrame's DatetimeIndex.

    Parameters
    ----------
    df_in : DataFrame with a DatetimeIndex (left untouched; a copy is returned).
    holidays_on : when True, add an `is_holiday` flag based on Belgian holidays.
    feature_format : 'linear' for raw hour/month/weekday integers, or
        'sinusoidal' for the cyclical sin/cos encoding of each.

    Returns
    -------
    A deep copy of `df_in` with the extra feature columns appended.
    """
    df = df_in.copy(deep=True)
    # weekday: Monday=0 ... Sunday=6, so >4 means Saturday or Sunday.
    df['is_weekend'] = (df.index.weekday > 4).astype(int)
    if holidays_on:
        country_holidays = holidays.Belgium()
        df['dates'] = df.index
        df['is_holiday'] = df.dates.apply(lambda x: x in country_holidays).astype(int)
        df.drop(columns='dates', inplace=True)
    if feature_format == 'linear':
        df['hour'] = df.index.hour
        df['month'] = df.index.month
        df['weekday'] = df.index.weekday
    if feature_format == 'sinusoidal':
        # Divide by the FULL period length (24 h, 12 months, 7 days) so the
        # encoding is truly cyclical. The previous denominators (23, 11, 6)
        # mapped the first and last value of each cycle to the same angle
        # (e.g. hour 0 and hour 23 became indistinguishable).
        df['hour_cos'] = np.cos(df.index.hour / 24 * 2 * np.pi)
        df['hour_sin'] = np.sin(df.index.hour / 24 * 2 * np.pi)
        df['month_cos'] = np.cos(df.index.month / 12 * 2 * np.pi)
        df['month_sin'] = np.sin(df.index.month / 12 * 2 * np.pi)
        df['weekday_cos'] = np.cos(df.index.weekday / 7 * 2 * np.pi)
        df['weekday_sin'] = np.sin(df.index.weekday / 7 * 2 * np.pi)
    return df
def create_lags(df_in, col=None, n_lags=(96, 96 * 2), na_strategy='bfill', verbose=False):
    """Append lagged copies of one column to a copy of the DataFrame.

    Parameters
    ----------
    df_in : source DataFrame (left untouched; a deep copy is returned).
    col : column to lag; defaults to the first column.
    n_lags : iterable of integer shifts, in rows, to create
        (tuple default avoids the mutable-default pitfall).
    na_strategy : 'bfill' or 'ffill' -- how to fill the NaNs that shifting creates.
    verbose : when True, print the resulting column list.

    Returns
    -------
    A deep copy of `df_in` with one `lag_<n>_<col>` column per requested shift.
    """
    df = df_in.copy(deep=True)
    col_ts = col if col is not None else df.columns[0]
    for lag in n_lags:
        df['lag_' + str(lag) + '_' + col_ts] = df[col_ts].shift(lag)
    # DataFrame.fillna(method=...) is deprecated; dispatch to the dedicated
    # bfill()/ffill() methods instead.
    if na_strategy == 'bfill':
        df = df.bfill()
    elif na_strategy == 'ffill':
        df = df.ffill()
    else:
        raise ValueError("na_strategy must be 'bfill' or 'ffill', got " + repr(na_strategy))
    if verbose:
        print(df.columns)
    return df
def plot_all_columns(df_input, graph_title):
    """Display one Scatter trace per column of *df_input* on a shared figure."""
    figure = go.Figure(
        data=[go.Scatter(x=df_input.index, y=df_input[column], name=str(column))
              for column in df_input.columns])
    figure.update_layout(title=graph_title)
    return figure.show()
##df_index = pd.date_range('2019-01-01' ,'2022-01-01' , freq = '15min' )
##df_values = 10*( np.sin( df_index.hour ))
##df = pd.DataFrame( index = df_index , data = {'measure' : 1000*( np.sin( df_index.hour )),
## 'temperature' : 4*( np.cos( df_index.hour )) })
# Experiment configuration: target column and the train / test date boundaries.
PAR = {
    'TARGET_VALUE': 'load',
    'TRAIN_START': '2019-01-01',
    'TRAIN_END': '2023-01-01',
    'TEST_END': '2023-12-31',
}
Le dataset¶
Entso-e : données de consommation des réseaux européens, disponibles de manière publique. Cas d'usage : charge électrique suisse
https://transparency.entsoe.eu/load-domain/r2/totalLoadR2/show
# Input
# Load the cleaned ENTSO-E load data; column 2 ('dt') is parsed as datetimes.
# NOTE(review): the escaped-backslash path is Windows-specific -- forward
# slashes or pathlib would be portable.
load = pd.read_csv('..\\data\\curated_data\\load_clean.csv', parse_dates = [2])
df = load.copy(deep = True)
df.set_index('dt' , inplace = True)
# Keep only the measured load here; `load` (with 'load_forecast') is reused at
# the end of the notebook to benchmark against the provider's own forecast.
df.drop( columns= 'load_forecast',inplace = True)
df.head()
| load | |
|---|---|
| dt | |
| 2019-01-01 00:00:00 | 7037.0 |
| 2019-01-01 01:00:00 | 7096.0 |
| 2019-01-01 02:00:00 | 7244.0 |
| 2019-01-01 03:00:00 | 7443.0 |
| 2019-01-01 04:00:00 | 7353.0 |
# Profile the series: timestamp gaps/duplicates, missing values, yearly stats.
describe_time_series(df, col = 'load')
DUPLICATE OR MISSING TIMESTAMPS Empty DataFrame Columns: [load, problem] Index: [] MISSING VALUES in 0 , 0.0 % TIMESTEPS NUMBER OF TIMESTEPS 50393
C:\Users\Patrizio\AppData\Local\Temp\ipykernel_20132\3123582690.py:21: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
print ('MISSING VALUES in ', ts.isnull().sum()[0], ',', (ts.isnull().sum()/ts.shape[0])[0] , ' % TIMESTEPS' )
load
sum mean min max std
dt
2019-01-01 63399414.0 7238.202306 4333.0 9924.0 981.167076
2020-01-01 62411627.0 7105.957759 4704.0 9874.0 949.418974
2021-01-01 63519856.0 7251.952963 4377.0 10242.0 1073.346090
2022-01-01 64614057.0 7376.876013 4510.0 10189.0 981.156288
2023-01-01 61054726.0 6971.309203 3747.0 10013.0 972.669413
2024-01-01 44015287.0 6694.340228 3746.0 9953.0 1128.948133
# Quick sanity check of the indexed frame.
df.head()
| load | |
|---|---|
| dt | |
| 2019-01-01 00:00:00 | 7037.0 |
| 2019-01-01 01:00:00 | 7096.0 |
| 2019-01-01 02:00:00 | 7244.0 |
| 2019-01-01 03:00:00 | 7443.0 |
| 2019-01-01 04:00:00 | 7353.0 |
Facteurs de dependence¶
# Feature engineering: cyclical calendar features plus lags of 7, 1 and 2 days
# (hourly data, so 168, 24 and 48 rows).
prep_df =( df.pipe( extract_time_features , feature_format = 'sinusoidal' )
.pipe( create_lags , col = PAR['TARGET_VALUE'], n_lags = [24*7, 24*1,24*2] ))
C:\Users\Patrizio\AppData\Local\Temp\ipykernel_20132\2393523276.py:14: FutureWarning: DataFrame.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.
prep_df.tail()
| load | is_weekend | is_holiday | hour_cos | hour_sin | month_cos | month_sin | weekday_cos | weekday_sin | lag_168_load | lag_24_load | lag_48_load | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| dt | ||||||||||||
| 2024-09-30 19:00:00 | 6736.0 | 0 | 0 | 0.460065 | -8.878852e-01 | 0.415415 | -0.909632 | 1.0 | 0.0 | 6971.0 | 5820.0 | 6268.0 |
| 2024-09-30 20:00:00 | 6876.0 | 0 | 0 | 0.682553 | -7.308360e-01 | 0.415415 | -0.909632 | 1.0 | 0.0 | 6898.0 | 6138.0 | 6158.0 |
| 2024-09-30 21:00:00 | 6549.0 | 0 | 0 | 0.854419 | -5.195840e-01 | 0.415415 | -0.909632 | 1.0 | 0.0 | 6543.0 | 5926.0 | 5896.0 |
| 2024-09-30 22:00:00 | 6672.0 | 0 | 0 | 0.962917 | -2.697968e-01 | 0.415415 | -0.909632 | 1.0 | 0.0 | 6084.0 | 5796.0 | 5582.0 |
| 2024-09-30 23:00:00 | 6287.0 | 0 | 0 | 1.000000 | -2.449294e-16 | 0.415415 | -0.909632 | 1.0 | 0.0 | 5793.0 | 6172.0 | 5481.0 |
# Visual check of all engineered features over a slice starting April 2020.
plot_all_columns(prep_df[prep_df.index > '2020-04-01 00:00:00'].head(96 * 7) , graph_title= 'Features pour une semaine ')
# Linear correlation of each feature with the target.
px.bar( prep_df.corr()['load'].sort_values() , title = 'Correlations avec Consommation MW' )
#df['temperature'] = 12 + 10 * np.cos( df.index.month / 12 * 2 * np.pi ) + 3 * np.sin( df.index.hour / 24 * 2 * np.pi ) + 3*random.uniform(-1, 1)
#describe_time_series( df , col = 'temperature')
Separer dataset train et test¶
# Chronological split: train up to TRAIN_END (inclusive), test strictly after.
train = prep_df[(prep_df.index >= PAR['TRAIN_START']) & (prep_df.index <= PAR['TRAIN_END'])]
# Fixed: the lower bound previously filtered on `df.index` (a different frame);
# it only worked because df and prep_df happen to share the same index.
test = prep_df[(prep_df.index > PAR['TRAIN_END']) & (prep_df.index < PAR['TEST_END'])]
y_train = train[PAR['TARGET_VALUE']]
X_train = train.drop(columns=PAR['TARGET_VALUE'])
y_test = test[PAR['TARGET_VALUE']]
X_test = test.drop(columns=PAR['TARGET_VALUE'])
Attention¶
Le référentiel de temps choisi ici est le temps « futur » → le moment dans le futur où la charge est consommée.
L'autre approche possible est le référentiel du « moment de la prévision », dans notre cas 24 heures avant le moment de la consommation. Pour simplicité et facilité de visualisation, on choisit le référentiel « futur ».
# Inspect the last training rows (boundary at TRAIN_END).
X_train.tail()
| is_weekend | is_holiday | hour_cos | hour_sin | month_cos | month_sin | weekday_cos | weekday_sin | lag_168_load | lag_24_load | lag_48_load | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| dt | |||||||||||
| 2022-12-31 20:00:00 | 1 | 0 | 0.682553 | -7.308360e-01 | 0.841254 | 0.540641 | 0.5 | -8.660254e-01 | 6750.0 | 7125.0 | 7124.0 |
| 2022-12-31 21:00:00 | 1 | 0 | 0.854419 | -5.195840e-01 | 0.841254 | 0.540641 | 0.5 | -8.660254e-01 | 6598.0 | 6997.0 | 6997.0 |
| 2022-12-31 22:00:00 | 1 | 0 | 0.962917 | -2.697968e-01 | 0.841254 | 0.540641 | 0.5 | -8.660254e-01 | 6547.0 | 7779.0 | 6969.0 |
| 2022-12-31 23:00:00 | 1 | 0 | 1.000000 | -2.449294e-16 | 0.841254 | 0.540641 | 0.5 | -8.660254e-01 | 6911.0 | 7872.0 | 6667.0 |
| 2023-01-01 00:00:00 | 1 | 1 | 1.000000 | 0.000000e+00 | 0.841254 | 0.540641 | 1.0 | -2.449294e-16 | 6946.0 | 7896.0 | 6904.0 |
# Inspect the last training targets (boundary at TRAIN_END).
y_train.tail()
dt 2022-12-31 20:00:00 6735.0 2022-12-31 21:00:00 6988.0 2022-12-31 22:00:00 7656.0 2022-12-31 23:00:00 7691.0 2023-01-01 00:00:00 7667.0 Name: load, dtype: float64
# Plot the target over time, one trace per split, to eyeball the cut point.
fig = go.Figure()
fig.add_trace( go.Scatter( x = y_train.index , y = y_train.values , name = 'Train'))
fig.add_trace( go.Scatter( x = y_test.index , y = y_test.values , name = 'Test'))
fig.update_layout( title = 'TRAIN - TEST')
fig.show()
Avant des modeles complexes: une baseline solide !¶
On note une périodicité horaire et une différence de jour en jour ; un modèle simple mais efficace pourrait être que la consommation de demain à l'heure X est la consommation d'il y a 7 jours à la même heure.
# Naive baseline: the load at hour X equals last week's load at the same hour
# (168-hour seasonal persistence on hourly data).
y_naive= prep_df[PAR['TARGET_VALUE']].shift( 24 * 7 )
y_naive_test = y_naive[y_naive.index.isin(y_test.index)]
y_naive_train= y_naive[y_naive.index.isin(y_train.index)]
fig = go.Figure()
fig.add_trace( go.Scatter( x = y_test.index , y = y_test , name = 'Test'))
fig.add_trace( go.Scatter( x = y_naive_test.index , y = y_naive_test , name = 'NAIVE 7 jours'))
fig.update_layout(title = 'NAIVE MODEL VS TEST SET')
fig.show()
# Hand-computed MAPE (in %) of the naive baseline on the test set.
print( np.round( 100* np.mean( np.abs( y_naive_test - y_test ) / y_test ) ,3 ) , 'MAPE NAIVE 7 jours' )
7.88 MAPE NAIVE 7 jours
# Error distribution of the naive baseline; the added Scatter trace draws a
# vertical segment at zero error for reference.
fig_hist = go.Figure()
fig_hist = px.histogram(x = y_naive_test - y_test , title = 'Distribution des erreurs' )
fig_hist.add_trace( go.Scatter( x = [0,0] , y = [0, 1000]))
Augmenter la complexité Graduellement¶
- Modele de regression lineaire
- Modele Random Forest
D'autres choix sont possibles:
- Travailler sur les parametres des modeles
- Travailler sur la forme du probleme en time serie
- Travailler avec des modeles plus complexes
Documentation
- https://scikit-learn.org/1.5/modules/generated/sklearn.pipeline.Pipeline.html
- https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
- https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html#sklearn.ensemble.RandomForestRegressor
# First ML model: feature scaling followed by ordinary least squares.
linear_regression_pipeline = Pipeline(
[
('scaler' , StandardScaler() ),
('LR' , LinearRegression())] )
linear_regression_pipeline.fit(X_train , y_train )
Pipeline(steps=[('scaler', StandardScaler()), ('LR', LinearRegression())])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('scaler', StandardScaler()), ('LR', LinearRegression())])StandardScaler()
LinearRegression()
# Second ML model: scaling + random forest with default depth, 50 trees.
# (Scaling is harmless but not required for tree-based models.)
random_forest_pipeline = Pipeline(
[
('scaler' , StandardScaler() ),
('RF' , RandomForestRegressor(n_estimators=50 ))] )
random_forest_pipeline.fit(X_train , y_train )
Pipeline(steps=[('scaler', StandardScaler()),
('RF', RandomForestRegressor(n_estimators=50))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('scaler', StandardScaler()),
('RF', RandomForestRegressor(n_estimators=50))])StandardScaler()
RandomForestRegressor(n_estimators=50)
# Smoke-test prediction of the fitted random-forest pipeline on the test set.
random_forest_pipeline.predict(X_test)
array([7138.9 , 7141.7 , 7449.8 , ..., 6906.52, 6912.86, 7003.28])
Travailler les hyperparametres¶
# Hyper-parameter grid for the 'RF' step of the pipeline (double-underscore
# syntax addresses a named step inside the Pipeline).
cv_params = { 'RF__max_depth' : [6,10] ,
'RF__n_estimators' : [100,120 ] }
## Cross validation
# Time-aware CV: folds respect chronology; gap=24 leaves a one-day buffer
# between train and validation folds to limit leakage from the 24h lag feature.
ts_cv = TimeSeriesSplit(n_splits = 2 , gap = 24 )
## Work multiple combinations of parameters
# MSE is negated (greater_is_better=False) because GridSearchCV maximizes.
grid_search_model = GridSearchCV(
estimator= random_forest_pipeline ,
param_grid= cv_params,
cv = ts_cv ,
scoring = make_scorer(mean_squared_error, greater_is_better= False) ,
return_train_score = True ,
verbose = 4, )
grid_search_model.fit(X_train, y_train)
Fitting 2 folds for each of 4 candidates, totalling 8 fits [CV 1/2] END RF__max_depth=6, RF__n_estimators=100;, score=(train=-158075.147, test=-192762.083) total time= 4.0s [CV 2/2] END RF__max_depth=6, RF__n_estimators=100;, score=(train=-169667.327, test=-208568.034) total time= 7.1s [CV 1/2] END RF__max_depth=6, RF__n_estimators=120;, score=(train=-157957.416, test=-192454.423) total time= 4.5s [CV 2/2] END RF__max_depth=6, RF__n_estimators=120;, score=(train=-169434.662, test=-208367.794) total time= 8.9s [CV 1/2] END RF__max_depth=10, RF__n_estimators=100;, score=(train=-79510.016, test=-180145.303) total time= 6.3s [CV 2/2] END RF__max_depth=10, RF__n_estimators=100;, score=(train=-102881.844, test=-190039.727) total time= 12.1s [CV 1/2] END RF__max_depth=10, RF__n_estimators=120;, score=(train=-79227.235, test=-180992.600) total time= 7.4s [CV 2/2] END RF__max_depth=10, RF__n_estimators=120;, score=(train=-102553.947, test=-190057.997) total time= 14.7s
GridSearchCV(cv=TimeSeriesSplit(gap=24, max_train_size=None, n_splits=2, test_size=None),
estimator=Pipeline(steps=[('scaler', StandardScaler()),
('RF',
RandomForestRegressor(n_estimators=50))]),
param_grid={'RF__max_depth': [6, 10],
'RF__n_estimators': [100, 120]},
return_train_score=True,
scoring=make_scorer(mean_squared_error, greater_is_better=False, response_method='predict'),
verbose=4)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GridSearchCV(cv=TimeSeriesSplit(gap=24, max_train_size=None, n_splits=2, test_size=None),
estimator=Pipeline(steps=[('scaler', StandardScaler()),
('RF',
RandomForestRegressor(n_estimators=50))]),
param_grid={'RF__max_depth': [6, 10],
'RF__n_estimators': [100, 120]},
return_train_score=True,
scoring=make_scorer(mean_squared_error, greater_is_better=False, response_method='predict'),
verbose=4)Pipeline(steps=[('scaler', StandardScaler()),
('RF', RandomForestRegressor(max_depth=10))])StandardScaler()
RandomForestRegressor(max_depth=10)
# Best hyper-parameter combination found by the grid search.
grid_search_model.best_params_
{'RF__max_depth': 10, 'RF__n_estimators': 100}
Compute Predictions¶
# Collect every model's predictions on both splits into two tagged frames.
results_train = pd.DataFrame(y_train)
results_train['pred_naive_'+PAR['TARGET_VALUE']] = y_naive_train
results_train['pred_lr_'+PAR['TARGET_VALUE']] = linear_regression_pipeline.predict(X_train)
results_train['pred_rf_'+PAR['TARGET_VALUE'] ] = random_forest_pipeline.predict(X_train)
results_train['pred_rf_cv_'+PAR['TARGET_VALUE'] ] = grid_search_model.predict(X_train)
results_train['set'] = 'train'
results = pd.DataFrame(y_test)
results['pred_naive_'+PAR['TARGET_VALUE']] = y_naive_test
results['pred_lr_'+PAR['TARGET_VALUE']] = linear_regression_pipeline.predict(X_test)
results['pred_rf_'+PAR['TARGET_VALUE'] ] = random_forest_pipeline.predict(X_test)
results['pred_rf_cv_'+PAR['TARGET_VALUE'] ] = grid_search_model.predict(X_test)
results['set'] = 'test'
# NOTE(review): ignore_index=True drops the datetime index of `res`; bfill()
# fills the NaNs the naive model leaves at the start of the train period.
res = pd.concat( [ results_train , results ], ignore_index = True ).bfill()
plot_all_columns(results , graph_title = 'Predictions')
Metriques¶
# Compute MAPE / RMSE / MSE for every prediction column (all columns except the
# target in front and the 'set' tag at the end), split by train/test subset.
records = []
for model_col in res.columns[1:-1]:
    for subset in res.set.unique():
        subset_df = res[res.set == subset]
        actual = subset_df[PAR['TARGET_VALUE']].values
        predicted = subset_df[model_col].values
        records.append({
            'result_set': subset,
            'models': model_col,
            'mape': np.round(100 * mean_absolute_percentage_error(actual, predicted), 3),
            'rmse': np.round(root_mean_squared_error(actual, predicted), 3),
            'mse': np.round(mean_squared_error(actual, predicted), 3),
        })
metrics_df = pd.DataFrame.from_records(records)
Metriques¶
# Pivot to one row per model, with train/test columns under each metric.
metrics_df.pivot(columns= 'result_set' , index= 'models')
| mape | rmse | mse | ||||
|---|---|---|---|---|---|---|
| result_set | test | train | test | train | test | train |
| models | ||||||
| pred_lr_load | 6.271 | 4.692 | 560.632 | 438.676 | 314308.709 | 192436.782 |
| pred_naive_load | 7.880 | 6.257 | 728.273 | 595.701 | 530382.182 | 354859.629 |
| pred_rf_cv_load | 6.140 | 3.684 | 555.650 | 343.848 | 308747.207 | 118231.227 |
| pred_rf_load | 6.239 | 1.486 | 563.757 | 142.499 | 317822.080 | 20306.091 |
# Benchmark against the provider's own forecast shipped with the dataset
# (restricted to 2023 to match the test period).
plot_all_columns(load, 'Available prediction')
load_test =load[ load.dt.dt.year == 2023 ]
print( "Available forecast Precision",
100*mean_absolute_percentage_error( load_test[PAR['TARGET_VALUE']].values , load_test['load_forecast'].values ))
Available forecast Precision 6.100084377544678